##########################################################
# John Henderson and Alex Theodoridis
# Replication Data for: "Seeing Spots", 
#  Forthcoming in Political Behavior, August 20, 2017
# 
##########################################################
#
#  pre_data_vagov.R
#  -- file produces outcome and treatment objects prior to analysis
#
##########################################################

# Outcomes 
#  - all_y - summary of four info-seeking choices (1-video_skipped,replay,share,getlink)
#  - video_skipped - skip video or not
#  - replay - replay video or not                           
#  - share - share with friend
#  - getlink - link to see videos like this 
#  - time_watched - total time watched first pass
#  - total_time

#  - dem_pos, dem_neg, rep_pos, rep_neg, neg_ad, pos_ad, dem_ad, rep_ad
#  - pid3, pid7, pid_lean, tr, vids

rm(list=ls())
library(foreign)
library(stringr)

# Read the three replication inputs. Character columns are kept as strings
# (not factors); zip_outs is the only file read without a header row.
va_data <- read.csv(
  file = "~/Dropbox/Seeing_Spots/replication/va_data_gov.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
zip_outs <- read.csv(
  file = "~/Dropbox/Seeing_Spots/replication/zip_outs.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
cnty_returns <- read.csv(
  file = "~/Dropbox/Seeing_Spots/replication/cnty_returns.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Skip indicators built from the first-pass viewing time (video_time).
#   1 = skipped     (video time <= cutoff)
#   0 = not skipped (video time >  cutoff)
#   NA when video_time is missing or not numeric.
#
# A viewing time exactly equal to the cutoff counts as skipped, so such
# respondents are no longer left as NA. video_time is coerced with
# as.numeric() (as it is wherever else this file uses it) so the comparison is
# numeric even if the column was read in as character.
#
# NOTE(review): negative video_time values are treated as invalid when the
# time outcomes are built later in this file, but they are counted as
# skipped here -- confirm this is intended.

# skipped @ 30 secs: this is the main video_skipped measure
va_data$video_skipped <- as.numeric(as.numeric(va_data$video_time) <= 30)
va_data$video_skipped_alt <- va_data$video_skipped

# video_skipped_60 is the 60 second behavior...
va_data$video_skipped_60 <- as.numeric(as.numeric(va_data$video_time) <= 60)

# treatment measures
# tr holds the treatment code as a string. Its first character is compared
# with "M" / "C" to flag the sponsoring side, and its second character with
# "P" / "N" to flag the tone of the ad.
tr <- as.character(va_data$videos_treat)

dem_ad <- substr(tr, 1, 1) == "M"
rep_ad <- substr(tr, 1, 1) == "C"

pos_ad <- substr(tr, 2, 2) == "P"
neg_ad <- substr(tr, 2, 2) == "N"

# sponsor-by-tone cells
dem_pos <- dem_ad & pos_ad
dem_neg <- dem_ad & neg_ad
rep_pos <- rep_ad & pos_ad
rep_neg <- rep_ad & neg_ad

# Single numeric code: 0 = dem_pos, 1 = dem_neg, 2 = rep_pos, 3 = rep_neg.
# NOTE(review): a code matching none of the four cells also evaluates to 0,
# i.e. it is indistinguishable from dem_pos -- confirm no such codes exist.
tr_full <- 0 * dem_pos + 1 * dem_neg + 2 * rep_pos + 3 * rep_neg


# Party identification
# pid3 is built from va_data$pid3lean: codes 1, 2, 3 are shifted to -1, 0, 1;
# any other value above 1 becomes NA; anything else is left untouched.
pid3 <- as.numeric(va_data$pid3lean)
pid3 <- as.numeric(
  ifelse(pid3 %in% 1:3, pid3 - 2, ifelse(pid3 > 1, NA, pid3))
)

# pid7 is built from va_data$pid7_pre: codes above 7 become NA and the scale
# is then centred by subtracting 4.
pid7 <- as.numeric(va_data$pid7_pre)
pid7 <- ifelse(pid7 > 7, NA, pid7) - 4

# pid_lean is the sign of the centred 7-point scale (-1, 0, 1).
pid_lean <- sign(pid7)


# political interest measures

# Knowledge items: TRUE when the answer equals the code scored as correct
# (1 for the House item, 2 for the Senate item), FALSE for any other numeric
# answer, NA when the answer is missing or not numeric.
hou_majority_correct <- array(
  as.numeric(va_data$majority_house_pre) == 1,
  length(va_data$majority_senat_pre)
)
sen_majority_correct <- array(
  as.numeric(va_data$majority_senat_pre) == 2,
  length(va_data$majority_senat_pre)
)

# number of knowledge items answered correctly (0, 1 or 2)
majority <- hou_majority_correct + sen_majority_correct

# News interest: codes above 4 are top-coded to 4, then the scale is reversed
# so that 3 = "Most of the time" and 0 = "Hardly at all".
# NOTE(review): per the codebook below, "Don't know" (7) and "Skipped" (8)
# are therefore scored 0 together with "Hardly at all" -- confirm intended.
newsint <- -(pmin(as.numeric(va_data$newsint_pre), 4) - 4)

#
#Name:         newsint         
#Description:  Interest in news and public affairs
#
#        Count Code Label
#        ----- ---- -----
#          569    1 Most of the time
#          243    2 Some of the time
#          115    3 Only now and then
#           53    4 Hardly at all
#           18    7 Don't know
#            2    8 Skipped
#            0    9 Not Asked


# outcomes
# skipping and video time

# Working copies of the skip indicators stored on va_data.
video_skipped <- va_data$video_skipped
video_skipped_60 <- va_data$video_skipped_60

# Alternative indicator: starts from video_skipped, but respondents flagged as
# skipping who nevertheless watched at least 25 seconds are recoded to 0.
video_skipped_alt <- va_data$video_skipped
video_skipped_alt[
  which(video_skipped == 1 & as.numeric(va_data$video_time) >= 25)
] <- 0

# proportion of ads skipped by type:
# One mean per distinct treatment code in tr. na.rm = TRUE keeps a single
# missing skip indicator from turning the whole cell into NA.
skipads <- tapply(video_skipped, tr, mean, na.rm = TRUE)


# not skipping is if video time > 30
# skipping is if video time <= 30
# times longer than 90 seconds are trimmed to NA
# [not sure these should be NA and not 0 on skipping]

# time watched (first pass)
# time_float1 keeps every non-negative time; time_watched additionally drops
# times above 90 seconds. Negative times become NA in both.
time_float1 <- as.numeric(va_data$video_time)
is.na(time_float1) <- which(time_float1 < 0)
time_watched <- time_float1
is.na(time_watched) <- which(time_watched > 90)

# time watched on the replay (video_time2), cleaned the same way
time_float2 <- as.numeric(va_data$video_time2)
is.na(time_float2) <- which(time_float2 < 0)
time_again <- time_float2
is.na(time_again) <- which(time_again > 90)

# Totals currently equal the first-pass times; adding the replay time is
# left disabled.
total_time <- time_watched # + time_again
total_float <- time_float1 # + time_float2


# The three items below share one recode: code 2 becomes 0, any code above 2
# becomes NA, and every other value (including code 1) is kept as is.

# replay is given not skipped; watch again
replay <- as.numeric(va_data$videoagain)
replay <- as.numeric(
  ifelse(replay > 2, NA, ifelse(replay == 2, 0, replay))
)

# share is share with friend, etc
share <- as.numeric(va_data$AGT309)
share <- as.numeric(
  ifelse(share > 2, NA, ifelse(share == 2, 0, share))
)

# getlink is ask for a video link
getlink <- as.numeric(va_data$AGT310)
getlink <- as.numeric(
  ifelse(getlink > 2, NA, ifelse(getlink == 2, 0, getlink))
)
    
# additive scale of 4 information seeking items
# (not skipping, sharing, replaying, asking for a link).
# Missing items contribute 0 to the sum; the scale is NA only when all four
# items are missing.
all_y <- rowSums(cbind(1 - video_skipped, share, replay, getlink), na.rm = TRUE)
all_y[
  rowSums(!is.na(cbind(video_skipped, share, replay, getlink))) == 0
] <- NA
#all_y=all_y/max(all_y,na.rm=T)

# same additive scale, using the 60 second skip indicator
all_y_60 <- rowSums(
  cbind(1 - video_skipped_60, share, replay, getlink),
  na.rm = TRUE
)
all_y_60[
  rowSums(!is.na(cbind(video_skipped_60, share, replay, getlink))) == 0
] <- NA
#all_y=all_y/max(all_y,na.rm=T)

# reliability
# Optional diagnostics for the four-item information-seeking scale.
# Disabled by default; set do.reliability to TRUE to run them.
do.reliability <- FALSE
if (do.reliability) {
  library(psych)
  rel_y <- cbind(1 - video_skipped, share, replay, getlink)

  # Cronbach's alpha among respondents shown a Democratic-sponsored ad.
  # This used to index by `obama_ad`, which is never created in this file;
  # dem_ad is the sponsor flag that is. which() drops rows whose treatment is
  # missing rather than inserting all-NA rows.
  # NOTE(review): confirm dem_ad is the intended subgroup.
  print(alpha(rel_y[which(dem_ad), , drop = FALSE]))

  # about .49
  set.seed(1005)
  # Fill each item's missing values with random 0/1 draws whose probabilities
  # are the observed shares of 0s and 1s in that item.
  for (k in seq_len(ncol(rel_y))) {
    missing_rows <- which(is.na(rel_y[, k]))
    observed <- rel_y[!is.na(rel_y[, k]), k]
    # Fixed levels keep the probability vector at length two even when an
    # item has only 0s or only 1s among its observed values.
    counts <- as.numeric(table(factor(observed, levels = c(0, 1))))
    rel_y[missing_rows, k] <- sample(
      c(0, 1),
      size = length(missing_rows),
      replace = TRUE,
      prob = counts / sum(counts)
    )
  }

  # One-dimensional classical MDS of the respondents, correlated with each
  # item.
  distMat <- dist(rel_y)
  all_sum <- cmdscale(distMat, k = 1)
  print(cor(all_sum, rel_y))
  # dimensionality dominated by not-skipping outcome
}

# END pre_data_vagov.R    